knitr::opts_chunk$set(echo=F, warning=F, message = F)
library(tidyverse)
library(recipes)
library(Data.Quality.Reports)
library(lubridate)

MISSING_TYPES <- c(NA, "", " ", "NA", "nan", "  /  /    ",
                   "NULL", "null", "Null")

Rates of missing observations by variable

``` {r results="asis"}

data <- params$path %>% readRDS() missing <- c(MISSING_TYPES, " / / ", "NULL", "null", "Null")

date.feature <- data[[params$date_variable]] location.feature <- data[[params$location_variable]]

missing.data <- data %>% select(-c(params$location_variable, params$date_variable)) %>% is.missing(missing)
missing.rates <- missing.data %>% apply(2, mean)

for (i in names(missing.rates)){ cat('\n')
cat(glue("{i}: {round(100*missing.rates[i], 3)}%\n")) cat("\n") }

## Summary of predictability of missing patterns based on location or date

Predictability is measured via the ROC, which ranges from 0.5 to 1, with 1 implying perfect predictability.  Data was divided into training and validation sets - so ROC represents predicatbility on data unseen during training.   Value of > 0.8 may be cuase for concern (see the figures that follow for interpretation of models).

``` {r}

missing.vars <- missing.rates[(missing.rates < 0.2) & (missing.rates > 0.001)] %>% 
  names()

date.missing <- lapply(
  missing.vars,
  function(i) {
    target <- data[[i]] %>% is.missing(missing)
    result <- feature.fit(target, date.feature)
  }
)

location.missing <- lapply(
  missing.vars,
  function(i) {
    target <- data[[i]] %>% is.missing(missing)
    feature.fit(target, location.feature)
  }
)
l <- length(missing.vars)
if ( l > 0){
  frm <- data.frame(
    Var = missing.vars,
    DateROC = sapply(date.missing, function(i) i$AUC)%>% round(3),
    LocROC = sapply(location.missing, function(i) i$AUC)%>% round(3)
  ) 
} else {
  frm <- data.frame(
    Var = NA,
    DateROC = NA,
    LocROC = NA
  )
}

frm %>% knitr::kable(caption="ROC-AUC values for location and date models.")
to.group <- function(values, result) {
  values <- values %>% str_replace_all("`", "")
  factor(result$naming_vector[values], levels = result$naming_vector)
}


generate_plot <- function(result, title=NA){
  plotdata <- varImp(result$model) %>% as.data.frame() %>% 
    mutate(Group = to.group(rownames(.), result))
  ncol <- (1 + nrow(plotdata) %/% 16)
  plotdata %>% 
    ggplot(aes(x=Group,  y=Score, color=Group)) + 
    geom_point() + guides(color=guide_legend(ncol=ncol,byrow=FALSE)) + 
    theme(axis.title.x=element_blank(),
                         axis.text.x=element_blank(),
                         axis.ticks.x=element_blank()) + 
    labs(title = title)
}

r params$date_variable on Missing

Figures to summarize Date on missing bias.

``` {r, fig.width=8} l <- length(missing.vars) if (l > 0){ plt <- purrr::map( 1:l, function(i) generate_plot(date.missing[[i]], missing.vars[i]) %>% print() ); }

## `r params$location_variable` on Missing

Figures to summarize Location  on missing bias.

``` {r, fig.width=12}

if (l > 0){ 
  plt <- purrr::map(
    1:l,
    function(i) generate_plot(location.missing[[i]], missing.vars[i]) %>%
      print()
  );
}


Stat-Cook/Data.Quality.Reports documentation built on May 4, 2022, 2:21 p.m.